library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyverse)
health_full <- read_csv("https://chronicdata.cdc.gov/api/views/swc5-untb/rows.csv?accessType=DOWNLOAD&bom=true&format=true")
## Rows: 176008 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): StateAbbr, StateDesc, LocationName, DataSource, Category, Measure,...
## dbl (6): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Lat...
## lgl (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(health_full)
## # A tibble: 6 × 23
## Year StateAbbr StateDesc LocationName DataSource Category Measure
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2018 US United States <NA> BRFSS Prevention Current lack…
## 2 2018 AL Alabama Bibb BRFSS Health Ou… Cancer (excl…
## 3 2018 AL Alabama Bullock BRFSS Health Ou… Cancer (excl…
## 4 2018 AL Alabama Choctaw BRFSS Health Ou… Current asth…
## 5 2018 AL Alabama Cleburne BRFSS Unhealthy… Binge drinki…
## 6 2018 AL Alabama Cleburne BRFSS Unhealthy… Obesity amon…
## # … with 16 more variables: Data_Value_Unit <chr>, Data_Value_Type <chr>,
## # Data_Value <dbl>, Data_Value_Footnote_Symbol <lgl>,
## # Data_Value_Footnote <lgl>, Low_Confidence_Limit <dbl>,
## # High_Confidence_Limit <dbl>, TotalPopulation <dbl>, Latitude <dbl>,
## # Longitude <dbl>, LocationID <chr>, CategoryID <chr>, MeasureId <chr>,
## # DataValueTypeID <chr>, Short_Question_Text <chr>, geolocation <chr>
year <- 2017:2020
quarters <- 1:4
type <- "Electric"
pge_elec <- NULL
for(quarter in quarters) {
filename <-
paste0(
"pge/PGE_",
year,
"_Q",
quarter,
"_",
type,
"UsageByZip.csv"
)
print(filename)
temp <- read_csv(filename)
pge_elec <- rbind(pge_elec,temp)
# Note rbind requires field names to be consistent for every new thing that you add.
saveRDS(pge_elec, "pge_elec.rds")
}
## [1] "pge/PGE_2017_Q1_ElectricUsageByZip.csv"
## [2] "pge/PGE_2018_Q1_ElectricUsageByZip.csv"
## [3] "pge/PGE_2019_Q1_ElectricUsageByZip.csv"
## [4] "pge/PGE_2020_Q1_ElectricUsageByZip.csv"
## Rows: 31363 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): CUSTOMERCLASS, COMBINED
## dbl (3): ZIPCODE, MONTH, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "pge/PGE_2017_Q2_ElectricUsageByZip.csv"
## [2] "pge/PGE_2018_Q2_ElectricUsageByZip.csv"
## [3] "pge/PGE_2019_Q2_ElectricUsageByZip.csv"
## [4] "pge/PGE_2020_Q2_ElectricUsageByZip.csv"
## Rows: 31377 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): CUSTOMERCLASS, COMBINED
## dbl (3): ZIPCODE, MONTH, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "pge/PGE_2017_Q3_ElectricUsageByZip.csv"
## [2] "pge/PGE_2018_Q3_ElectricUsageByZip.csv"
## [3] "pge/PGE_2019_Q3_ElectricUsageByZip.csv"
## [4] "pge/PGE_2020_Q3_ElectricUsageByZip.csv"
## Rows: 31394 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): CUSTOMERCLASS, COMBINED
## dbl (3): ZIPCODE, MONTH, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "pge/PGE_2017_Q4_ElectricUsageByZip.csv"
## [2] "pge/PGE_2018_Q4_ElectricUsageByZip.csv"
## [3] "pge/PGE_2019_Q4_ElectricUsageByZip.csv"
## [4] "pge/PGE_2020_Q4_ElectricUsageByZip.csv"
## Rows: 33975 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): CUSTOMERCLASS, COMBINED
## dbl (3): ZIPCODE, MONTH, YEAR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pge_filter <- filter(pge_elec, CUSTOMERCLASS %in% c("Elec- Residential","Elec- Commercial"))
names(pge_filter)
## [1] "ZIPCODE" "MONTH" "YEAR" "CUSTOMERCLASS"
## [5] "COMBINED" "TOTALCUSTOMERS" "TOTALKWH" "AVERAGEKWH"
head(pge_filter)
## # A tibble: 6 × 8
## ZIPCODE MONTH YEAR CUSTOMERCLASS COMBINED TOTALCUSTOMERS TOTALKWH AVERAGEKWH
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 93101 1 2017 Elec- Commercial Y 0 0 NA
## 2 93101 2 2017 Elec- Commercial Y 0 0 NA
## 3 93101 3 2017 Elec- Commercial Y 0 0 NA
## 4 93105 1 2017 Elec- Commercial Y 0 0 NA
## 5 93105 2 2017 Elec- Commercial Y 0 0 NA
## 6 93105 3 2017 Elec- Commercial Y 0 0 NA
pge_select <-
select(
pge_filter,
)
pge_select <-
select(
pge_filter,
-c(COMBINED, AVERAGEKWH)
)
pge_group <-
group_by(
pge_select,
YEAR,
MONTH,
CUSTOMERCLASS
)
pge_summarize <-
summarize(
pge_group,
TOTALKWH =
sum(
TOTALKWH,
na.rm = T
)
)
## `summarise()` has grouped output by 'YEAR', 'MONTH'. You can override using the `.groups` argument.
pge_wide <-
pivot_wider(
pge_summarize,
names_from = CUSTOMERCLASS,
values_from = TOTALKWH
)
pge_wide
## # A tibble: 48 × 4
## # Groups: YEAR, MONTH [48]
## YEAR MONTH `Elec- Commercial` `Elec- Residential`
## <dbl> <dbl> <dbl> <dbl>
## 1 2017 1 2518133184 2696632298
## 2 2017 2 2186409895 2089405296
## 3 2017 3 2328765919 2007875145
## 4 2017 4 2260528589 1838107707
## 5 2017 5 2472760712 2070656073
## 6 2017 6 2579187737 2615607826
## 7 2017 7 2808792813 3113124827
## 8 2017 8 2925396026 2995761408
## 9 2017 9 5462119756 5072659238
## 10 2017 10 2739510203 1956786021
## # … with 38 more rows
pge_tidy <-
pivot_longer(
pge_wide,
c("Elec- Commercial", "Elec- Residential"),
names_to = "CUSTOMERCLASS",
values_to = "TOTALKWH"
)
pge_tidy
## # A tibble: 96 × 4
## # Groups: YEAR, MONTH [48]
## YEAR MONTH CUSTOMERCLASS TOTALKWH
## <dbl> <dbl> <chr> <dbl>
## 1 2017 1 Elec- Commercial 2518133184
## 2 2017 1 Elec- Residential 2696632298
## 3 2017 2 Elec- Commercial 2186409895
## 4 2017 2 Elec- Residential 2089405296
## 5 2017 3 Elec- Commercial 2328765919
## 6 2017 3 Elec- Residential 2007875145
## 7 2017 4 Elec- Commercial 2260528589
## 8 2017 4 Elec- Residential 1838107707
## 9 2017 5 Elec- Commercial 2472760712
## 10 2017 5 Elec- Residential 2070656073
## # … with 86 more rows
pge_summarize <-
summarize(
pge_group,
TOTALKWH =
sum(
TOTALKWH,
na.rm = T
),
TOTALCUSTOMERS =
sum(
TOTALCUSTOMERS,
na.rm = T
)
)
## `summarise()` has grouped output by 'YEAR', 'MONTH'. You can override using the `.groups` argument.
pge_mutate <-
mutate(
pge_summarize,
AVERAGEKWH =
TOTALKWH/TOTALCUSTOMERS
) %>%
mutate(
DATE2 =
paste(
YEAR,
MONTH,
"01",
sep="-"
) %>% as.Date()
)
pge_mutate
## # A tibble: 96 × 7
## # Groups: YEAR, MONTH [48]
## YEAR MONTH CUSTOMERCLASS TOTALKWH TOTALCUSTOMERS AVERAGEKWH DATE2
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <date>
## 1 2017 1 Elec- Commercial 2518133184 312242 8065. 2017-01-01
## 2 2017 1 Elec- Residential 2696632298 4427009 609. 2017-01-01
## 3 2017 2 Elec- Commercial 2186409895 312327 7000. 2017-02-01
## 4 2017 2 Elec- Residential 2089405296 4429320 472. 2017-02-01
## 5 2017 3 Elec- Commercial 2328765919 312586 7450. 2017-03-01
## 6 2017 3 Elec- Residential 2007875145 4432096 453. 2017-03-01
## 7 2017 4 Elec- Commercial 2260528589 313499 7211. 2017-04-01
## 8 2017 4 Elec- Residential 1838107707 4437095 414. 2017-04-01
## 9 2017 5 Elec- Commercial 2472760712 313795 7880. 2017-05-01
## 10 2017 5 Elec- Residential 2070656073 4438946 466. 2017-05-01
## # … with 86 more rows
# piping practice
pge_final_elec <-
pge_elec %>%
filter(
CUSTOMERCLASS %in%
c("Elec- Residential", "Elec- Commercial")
) %>%
mutate(
DATE =
paste(
YEAR,
MONTH,
"01",
sep="-"
) %>% as.Date()
) %>%
group_by(DATE, CUSTOMERCLASS) %>%
select(
-c(YEAR, MONTH, COMBINED, AVERAGEKWH)
) %>%
summarize(
TOTALKBTU =
sum(
3.41214*TOTALKWH,
na.rm = T
),
TOTALCUSTOMERS =
sum(
TOTALCUSTOMERS,
na.rm = T
)
) %>%
mutate(
AVERAGEKBTU =
TOTALKBTU/TOTALCUSTOMERS)
## `summarise()` has grouped output by 'DATE'. You can override using the `.groups` argument.
pge_final_elec
## # A tibble: 96 × 5
## # Groups: DATE [48]
## DATE CUSTOMERCLASS TOTALKBTU TOTALCUSTOMERS AVERAGEKBTU
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2017-01-01 Elec- Commercial 8592222962. 312242 27518.
## 2 2017-01-01 Elec- Residential 9201286929. 4427009 2078.
## 3 2017-02-01 Elec- Commercial 7460336659. 312327 23886.
## 4 2017-02-01 Elec- Residential 7129343387. 4429320 1610.
## 5 2017-03-01 Elec- Commercial 7946075343. 312586 25420.
## 6 2017-03-01 Elec- Residential 6851151097. 4432096 1546.
## 7 2017-04-01 Elec- Commercial 7713240020. 313499 24604.
## 8 2017-04-01 Elec- Residential 6271880831. 4437095 1414.
## 9 2017-05-01 Elec- Commercial 8437405736. 313795 26888.
## 10 2017-05-01 Elec- Residential 7065368413. 4438946 1592.
## # … with 86 more rows
library(tidyverse)
library(plotly)
pge_chart_elec <-
pge_final_elec %>%
ggplot() +
geom_bar(
aes(
x = DATE,
y = TOTALKBTU,
fill = CUSTOMERCLASS
),
stat = "identity",
position = "stack"
) +
labs(
x = "Month",
y = "kBTU",
title = "PG&E Territory Monthly Electricity Usage, 2017-2020",
fill = "Electricity Type"
)
pge_chart_elec

pge_chart_elec %>% ggplotly()
pge_chart_elec %>%
ggplotly() %>%
layout(
xaxis = list(fixedrange = T),
yaxis = list(fixedrange = T)
) %>%
config(displayModeBar = F)